from langchain.text_splitter import RecursiveCharacterTextSplitter
from tool import custom_print

"""
格式化长文本

"""
# Load the whole input file into memory as one UTF-8 decoded string.
with open('test4.txt', mode='r', encoding='utf-8') as source_file:
    state_of_the_union = source_file.read()

# Splitting rule: paragraph > line > sentence > hard split.
split_priority = ["\n\n", "\n", "。", ""]

# Chunks are limited to 70 units as measured by len(), with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=70,
    chunk_overlap=0,
    length_function=len,
    separators=split_priority,
    keep_separator=False,
)

# Split the loaded text into documents, then hand them to the project's
# print helper.
raw_inputs = [state_of_the_union]
texts = text_splitter.create_documents(raw_inputs)
custom_print.print_all(texts)
